.libPaths('C:\\1\\R')
library('Rcpp')
library('ggplot2')
library('dplyr')
library('reshape2')
library('spdep')
library('RColorBrewer')
library('ggrepel')
library('viridis')
library('SSN')
library('SSNdesign')
library('rstan')
library('ggmcmc')
library('StanHeaders')
library('knitr')
library('plotly')

In this tutorial we fit a Bayesian tail-up exponential spatial stream network model. This model has previously been implemented using likelihood methods in the R package SSN (Ver Hoef et al. 2014).

Model

\[ y = X\beta + \varepsilon \]

For a pair of sites \(s_i\), \(s_j\) the tail-up covariance matrix is

\[C_{TU}(s_i,s_j|\theta) = \left\{\begin{matrix} 0 \:\:\: \textrm{if $s_i,s_j$ are flow unconnected} \\W_{ij} C_t \:\:\: \textrm{if $s_i,s_j$ are flow connected} \end{matrix}\right.\]

The matrix \(W_{ij}\) contains the spatial weights defined by the branching structure of the network.

The tail-up exponential is given by \(C_t\left (h| \theta \right ) = \sigma^2_{u}e^{-3h/\alpha}\)

where \(\sigma^2_{u}\) is the partial sill and \(\alpha\) is the range parameter.

\(Var(\varepsilon) = \Sigma\) where \(\Sigma = C_t \odot W_{ij} + \sigma^2_{NU}I\).

Creating an artificial dataset

# Directory where the simulated SSN object will be written to disk.
path <- "./bayes32.ssn"
set.seed(20200618)

# Simulate a stream network; observation and prediction sites are placed
# along the edges via Poisson processes with the given intensities.
n <- createSSN(
  n = c(300),
  obsDesign = poissonDesign(c(0.5)),
  predDesign = poissonDesign(c(0.33)),
  path = path,
  importToR = TRUE,
  treeFunction = iterativeTreeLayout
)

This stream network is composed of the following number of observation points:

## [1] 151

and prediction locations:

## [1] 104
# Pick a single mid-intensity colour from the YlGnBu palette for the stream lines.
col <- brewer.pal(9, 'YlGnBu')[6]

# Plot the network; line width is scaled by the additive function value
# ("addfunccol"), so larger streams are drawn thicker.
plot(
  n,
  lwdLineCol = "addfunccol",
  lwdLineEx = 8,
  lineCol = col,
  col = 1,
  pch = 16,
  cex = 1,
  xlab = "x-coordinate",
  ylab = "y-coordinate"
)

# Overlay the prediction locations as red squares.
# FIX: use TRUE rather than the reassignable shorthand T.
plot(n, PredPointsID = "preds", add = TRUE, pch = 15, cex = 1, col = "#E41A1C")

# Compute hydrologic distance matrices between all pairs of sites.
# FIX: spell the argument out in full ("amongpreds"); the original relied on
# R's partial argument matching ("amongpred"), which lintr flags.
createDistMat(n, "preds", o.write = TRUE, amongpreds = TRUE)

rawDFobs <- getSSNdata.frame(n, "Obs")
rawDFpred <- getSSNdata.frame(n, "preds")

# Generate continuous covariates at both the observed and prediction sites.
# nrow() is the idiomatic (and clearer) form of length(df[, 1]).
rawDFobs[,"X1"] <- rnorm(nrow(rawDFobs))
rawDFpred[,"X1"] <- rnorm(nrow(rawDFpred))
rawDFobs[,"X2"] <- rnorm(nrow(rawDFobs))
rawDFpred[,"X2"] <- rnorm(nrow(rawDFpred))
rawDFobs[,"X3"] <- rnorm(nrow(rawDFobs))
rawDFpred[,"X3"] <- rnorm(nrow(rawDFpred))

# Write the augmented data frames back into the SSN object.
n <- putSSNdata.frame(rawDFobs, n, Name = 'Obs')
n <- putSSNdata.frame(rawDFpred, n, Name = 'preds')

Setting the parameters \(\sigma^2_{TU} = 3\), \(\alpha = 10\) and \(\sigma^2_{NU} = 0.1\). Using an intercept \(\beta_0= 10\) and slopes \(\beta_{1,2,3} = \{1,0,-1\}\).

# Simulate the response on the network under a tail-up exponential
# covariance with a nugget effect.
set.seed(66)
sim.out <- SimulateOnSSN(
  n,
  ObsSimDF = rawDFobs,
  PredSimDF = rawDFpred,
  PredID = "preds",
  formula = ~ X1 + X2 + X3,
  coefficients = c(10, 1, 0, -1),
  CorModels = c(
    "Exponential.tailup"),
  use.nugget = TRUE,
  CorParms = c(3, 10,.1),
  addfunccol = "addfunccol"
)
# NOTE(review): the simulation above is immediately overwritten by a cached
# result from disk — presumably to make the tutorial output reproducible
# across SSN versions. Confirm 'sim.out_32.rds' was generated with the
# same parameters used here.
sim.out <- readRDS('sim.out_32.rds')
sim.ssn <- sim.out$ssn.object

simDFobs <- getSSNdata.frame(sim.ssn, "Obs")
simDFpred <- getSSNdata.frame(sim.ssn, "preds")

# Keep the true simulated values at the prediction sites, then blank them
# out in the SSN object so they can be predicted below.
simpreds <- simDFpred[,"Sim_Values"]
simDFpred[,"Sim_Values"] <- NA
sim.ssn <- putSSNdata.frame(simDFpred, sim.ssn, "preds")

Fitting the model using SSN::glmssn() and obtaining the predictions.

The regression coefficients are well retrieved. The estimates of \(\sigma^2_{TU}\) and \(\sigma^2_{NU}\) are quite precise, while \(\alpha\) is overestimated.

# Frequentist benchmark: fit the tail-up exponential model (plus nugget)
# with SSN::glmssn(), using "addfunccol" for the spatial weights.
glmssn.out <- glmssn(Sim_Values ~ X1 + X2 + X3, sim.ssn,
                     CorModels = c("Exponential.tailup"),  
                     addfunccol = "addfunccol")
summary(glmssn.out)
## 
## Call:
## glmssn(formula = Sim_Values ~ X1 + X2 + X3, ssn.object = sim.ssn, 
##     CorModels = c("Exponential.tailup"), addfunccol = "addfunccol")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6983 -1.4411 -0.0251  1.0826  3.8906 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.31406    0.21018   49.07   <2e-16 ***
## X1           1.02921    0.07205   14.29   <2e-16 ***
## X2          -0.01472    0.06134   -0.24    0.811    
## X3          -0.98576    0.08894  -11.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Covariance Parameters:
##    Covariance.Model Parameter Estimate
##  Exponential.tailup   parsill   3.2647
##  Exponential.tailup     range  22.2954
##              Nugget   parsill   0.0999
## 
## Residual standard error: 1.834283
## Generalized R-squared: 0.7048008
# Kriging predictions at the prediction sites.
glmssn.pred <- predict(glmssn.out, "preds")
predDF <- getSSNdata.frame(glmssn.pred, "preds")

How good are the predictions?

# Scatter of true vs. predicted values at the prediction sites; the
# identity line marks perfect prediction.
ggplot() +
  geom_point(aes(x = simpreds, y = predDF[, "Sim_Values"])) +
  geom_abline(intercept = 0, slope = 1) +
  labs(x = "True", y = "Predicted") +
  lims(x = c(2.5, 17.5), y = c(2.5, 17.5)) +
  coord_fixed() +
  theme_bw()

Creating the downstream hydrologic distance matrices. Here the columns represent the FROM site and the rows represent the TO site.

# Recompute the hydrologic distance matrices.
# FIX: argument spelled out in full ("amongpreds"); the original relied on
# partial argument matching ("amongpred").
createDistMat(n, "preds", o.write = TRUE, amongpreds = TRUE)

# The distance files are serialized single R objects (readable with
# readRDS despite the .RData extension SSN gives them).
do  <- readRDS(paste0(path, '/distance/obs/dist.net1.RData'))     # distance between observations
dp  <- readRDS(paste0(path, '/distance/preds/dist.net1.RData'))   # distance between prediction points
dpo <- readRDS(paste0(path, '/distance/preds/dist.net1.a.RData')) # distance observation to prediction
dop <- readRDS(paste0(path, '/distance/preds/dist.net1.b.RData')) # distance prediction to observation

# Downstream-only distances for all (obs + pred) pairs, in block form.
D <- rbind(cbind(do, dpo), cbind(dop, dp))

# Total hydrologic distance between each pair: downstream distance from
# each site to their common junction, summed via D + t(D).
H <- D + base::t(D)

Creating data frames with the observations and predictions

# Pull the simulated observation and prediction data frames back out of
# the SSN object for use in the Stan data list below.
obs_data <- getSSNdata.frame(sim.ssn, "Obs")
pred_data <- getSSNdata.frame(sim.ssn, "preds")

Obtaining the spatial weight matrix (W) of the observations and predictions based on the watershed area.

# Point IDs and additive-function values (watershed-area proxy) for all
# observed and prediction sites, stacked in the same order as D.
afv <- rbind(obs_data[c('pid', 'addfunccol')],
             pred_data[c('pid', 'addfunccol')])

# Code adapted from SSN::glmssn(). nsofar is the block offset; it is 0
# here because there is a single network.
nsofar <- 0
dist.junc <- matrix(0, nrow = nrow(afv), ncol = nrow(afv))
distmat <- D
ni <- ncol(distmat)
# Reorder rows/columns by pid so they align with afv.
ordpi <- order(as.numeric(rownames(distmat)))
dist.junc[(nsofar + 1):(nsofar + ni), (nsofar + 1):(nsofar + ni)] <-
  distmat[ordpi, ordpi, drop = FALSE]   # FIX: FALSE, not shorthand F
# b.mat is zero for a pair exactly when one site lies downstream of the
# other, i.e. the pair is flow-connected.
b.mat <- pmin(dist.junc, base::t(dist.junc))
dist.hydro <- as.matrix(dist.junc + base::t(dist.junc))
flow.con.mat <- 1 - (b.mat > 0) * 1   # 1 = flow-connected, 0 = unconnected

n.all <- ni
# Tail-up spatial weights: sqrt of the ratio of the smaller to the larger
# additive-function value for each pair, zeroed for flow-unconnected pairs.
w.matrix <- sqrt(pmin(outer(afv[, 'addfunccol'], rep(1, times = n.all)),
                      base::t(outer(afv[, 'addfunccol'], rep(1, times = n.all)))) /
                   pmax(outer(afv[, 'addfunccol'], rep(1, times = n.all)),
                        base::t(outer(afv[, 'addfunccol'], rep(1, times = n.all))))) *
  flow.con.mat

Bayesian model in Stan

We refer to this model as SSNbayes. Creating a list with the observations/predictions and covariates. Note that we use index arrays and slicing to include missing data in the prediction points dataset.

# Assemble the data list for Stan. Missing responses at the prediction
# sites are handled through the index arrays i_y_obs / i_y_mis.
n_obs <- nrow(obs_data)
n_pred <- nrow(pred_data)
N <- n_obs + n_pred
X_all <- rbind(cbind(1, obs_data[, c("X1", "X2", "X3")]),
               cbind(1, pred_data[, c("X1", "X2", "X3")]))

data2 <- list(N = N,                       # obs + pred points
              K = 4,                       # columns of the design matrix
              y_obs = obs_data$Sim_Values, # observed responses
              N_y_obs = n_obs,             # number of observed points
              N_y_mis = n_pred,            # number of prediction points
              i_y_obs = 1:n_obs,           # indices of observed points
              i_y_mis = (n_obs + 1):N,     # indices of prediction points
              X = X_all,                   # obs + pred design matrix
              h = H,                       # total stream distance
              W = w.matrix,                # spatial weights
              I = diag(1, N, N)            # N x N identity matrix
)

We code the Stan model using the tail up exponential structure:

# Stan code for the Bayesian tail-up exponential SSN model. The full
# response vector y is split into observed (y_obs) and missing (y_mis)
# parts; the missing values at the prediction sites are sampled as
# parameters, so predictions come for free from the posterior of y_mis.
# The model-string content is program data and is left byte-identical.
ssn_model2 <- "
  data {
    int<lower=1> N; 
    int<lower=1> K; 
    matrix[N,K] X;
    
    int<lower = 0> N_y_obs; // number observed values
    int<lower = 0> N_y_mis; // number missing values
    int<lower = 1, upper = N_y_obs + N_y_mis> i_y_obs[N_y_obs]; // i index of observed
    int<lower = 1, upper = N_y_obs + N_y_mis> i_y_mis[N_y_mis]; // i index of missing
    vector[N_y_obs] y_obs; // 

    matrix[N, N] W ; // spatial weights
    matrix[N, N] h ; // total hydrological dist 
    matrix[N, N] I ; // diag matrix
  }
  
  parameters {
    real<lower=0> sigma_tu;
    real<lower=0> sigma_nug;
    vector[K] beta;
    real<lower=0> alpha;
    vector[N_y_mis] y_mis; //declaring the missing y
    
  }

  transformed parameters {
    vector[N] y;
    real<lower=0> var_tu; // parsil
    real<lower=0> var_nug; // nugget
    matrix[N, N] C_tu;
    matrix[N, N] C1;
    y[i_y_obs] = y_obs; 
    y[i_y_mis] = y_mis;
    
    var_tu = sigma_tu ^ 2;
    var_nug = sigma_nug ^ 2;
    C1 = var_tu * exp(- 3 * h / alpha); // tail up exponential model
    C_tu = C1 .* W; // Hadamard (element-wise) product

  }
   
  model {
    matrix[N, N] L_Sigma = cholesky_decompose(C_tu +  var_nug * I); // NB
    target += multi_normal_cholesky_lpdf(y | X * beta, L_Sigma ); 
    sigma_tu ~ cauchy(0,3);  // prior on sd tail-up model
    sigma_nug ~ cauchy(0,1); // prior on nugget effect
    alpha ~ uniform(0,25);    // NB: need to change this by uniform discrete with a log steps
   }
  
  "

We initialize using the parameter estimates obtained from the SSN::glmssn() call made above:

# Initialise each chain at the REML estimates from glmssn() above.
# BUG FIX: var_tu and var_nug are *transformed* parameters in the Stan
# model, so initial values supplied under those names are not used by the
# sampler. The sampled parameters are the standard deviations sigma_tu
# and sigma_nug; initialise them with the square roots of the estimated
# variances (theta[1] and theta[3]) instead.
ini <- function() {
  list(sigma_tu = sqrt(glmssn.out$estimates$theta[1]),
       alpha = glmssn.out$estimates$theta[2],
       sigma_nug = sqrt(glmssn.out$estimates$theta[3])
  )
}

# This model takes approximately 30 minutes to run.
fit2 <- stan(model_code = ssn_model2,
             model_name = "ssn_model2",
             pars = c('var_tu', 'alpha', 'var_nug', 'beta', 'sigma_tu', 'sigma_nug', 'y'),
             data = data2,
             init = ini,
             iter = 4000,
             warmup = 2000,
             thin = 2,
             chains = 3,
             verbose = FALSE,   # FIX: FALSE, not shorthand F
             seed = 999,
             refresh = max(5000 / 100, 1)
)

Parameter estimates and summary statistics:

# Posterior summary table: covariance parameters and regression betas.
stats <- summary(fit2)$summary
knitr::kable(stats[1:7, ])
mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
var_tu 3.1755614 0.0142915 0.4735066 2.3651773 2.8383124 3.1385710 3.4596220 4.2508098 1097.7287 1.004049
alpha 18.8508490 0.1468276 3.9807292 10.2824640 15.9709579 19.2763195 22.1702152 24.7262010 735.0377 1.001912
var_nug 0.1068068 0.0021839 0.0570671 0.0286625 0.0653186 0.0954218 0.1366471 0.2391214 682.8048 1.009291
beta[1] 10.2949676 0.0060626 0.2110082 9.8726865 10.1531553 10.2959287 10.4387988 10.6980608 1211.3775 1.002170
beta[2] 1.0389127 0.0023540 0.0758226 0.8894299 0.9888071 1.0399870 1.0893453 1.1899727 1037.4866 1.000586
beta[3] -0.0130249 0.0024405 0.0666217 -0.1437327 -0.0562815 -0.0132675 0.0315202 0.1156476 745.1964 1.010958
beta[4] -0.9880633 0.0031736 0.0926282 -1.1785926 -1.0466877 -0.9873243 -0.9259296 -0.8175289 851.8705 1.002319

Comparing the model predictions with the latent value:

# Posterior summaries of the latent y at the prediction sites: take the
# "y[...]" rows and keep only those after the observed block.
y_rows <- grep("y\\[", row.names(stats))
pred_idx <- (nrow(obs_data) + 1):(nrow(obs_data) + nrow(pred_data))
ypred <- data.frame(stats[y_rows, ][pred_idx, ])
ypred$ytrue <- simpreds

# Kriging predictions from glmssn() for comparison.
ypred$ypred_glmssn <- predDF[, "Sim_Values"]

# Non-spatial benchmark: ordinary least squares via lm().
lm_fit <- lm(Sim_Values ~ X1 + X2 + X3, obs_data)
lm_pred <- stats::predict(lm_fit, newdata = pred_data[, c('X1', 'X2', 'X3')])
ypred$ypred_lm <- lm_pred

# The SSNbayes point prediction is the posterior mean.
ypred$ypred_ssnbayes <- ypred$mean

# Numeric site index extracted from the row names ("y[152]" -> 152).
ypred$id <- as.numeric(unlist(regmatches(rownames(ypred),
                                         gregexpr("[[:digit:]]+", rownames(ypred)))))

# Long format for plotting the three prediction methods together.
ypredm <- melt(ypred[, c("id", "ytrue", "ypred_lm", "ypred_glmssn", "ypred_ssnbayes")],
               id.vars = c('id', "ytrue"))

Plotting the predictions obtained using the three different methods: lm(), glmssn() and SSNbayes.

# Predicted vs. true values for the three methods; vertical bars show the
# 95% credible intervals from the Bayesian fit.
p <- ggplot(ypredm) +
  geom_errorbar(data = ypred,
                aes(x = ytrue, ymin = X97.5., ymax = X2.5.),
                col = 3, width = 0.2, size = 0.5, alpha = 0.25) +
  geom_point(aes(x = ytrue, y = value, group = variable, col = variable)) +
  stat_smooth(geom = 'line', method = 'lm',
              aes(x = ytrue, y = value, group = variable, col = variable),
              se = FALSE, alpha = 0.7) +
  geom_abline(intercept = 0, slope = 1) +
  scale_color_manual(values = c(1, 2, 3)) +
  labs(x = "True", y = "Predicted") +
  lims(x = c(2.5, 17.5), y = c(2.5, 17.5)) +
  coord_fixed() +
  theme_bw()

ggplotly(p)

Comparing the RMSE produced by the different models:

# Root-mean-square error of each method at the prediction sites.
rmse <- function(truth, est) sqrt(mean((truth - est)^2))
(rmse_lm <- rmse(ypred$ytrue, ypred$ypred_lm))             # using lm()
## [1] 1.805427
(rmse_glmssn <- rmse(ypred$ytrue, ypred$ypred_glmssn))     # using glmssn()
## [1] 1.424941
(rmse_ssnbayes <- rmse(ypred$ytrue, ypred$ypred_ssnbayes)) # using SSNbayes()
## [1] 1.411547

Conclusions

Future work

Checking convergence and MCMC diagnostics

# Load the diagMCMC() convenience diagnostic-plot function.
# NOTE(review): hard-coded absolute Windows path — presumably machine-specific.
source("C:\\1\\DBDA2E-utilities2.R") # from: Kruschke, J. K. (2015). Doing Bayesian Data Analysis

\(\sigma^2_{TU}\)

# MCMC diagnostics (trace, autocorrelation, etc.) for the tail-up partial sill.
diagMCMC( codaObject=As.mcmc.list(fit2,pars=c("var_tu","alpha","var_nug")), parName="var_tu" ) # 

range \(\alpha\)

# MCMC diagnostics for the range parameter alpha.
diagMCMC( codaObject=As.mcmc.list(fit2,pars=c("var_tu","alpha","var_nug")), parName="alpha" ) #

\(\sigma^2_{NU}\)

# MCMC diagnostics for the nugget variance.
diagMCMC( codaObject=As.mcmc.list(fit2,pars=c("var_tu","alpha","var_nug")), parName="var_nug" ) 

Regression coefficients:

# MCMC diagnostics for each regression coefficient in turn.
for (bname in paste0("beta[", 1:4, "]")) {
  diagMCMC(codaObject = As.mcmc.list(fit2, pars = "beta"), parName = bname)
}

First prediction point \(y\):

# MCMC diagnostics for the first prediction point: index 152 is the first
# site after the 151 observed points.
diagMCMC( codaObject=As.mcmc.list(fit2,pars="y") , parName=c("y[152]") ) 

Bibliography

Hoef, Jay M. Ver, Erin E. Peterson, David Clifford, and Rohan Shah. 2014. “SSN: An R Package for Spatial Statistical Modeling on Stream Networks.” Journal of Statistical Software 56 (3): 1–45. http://www.jstatsoft.org/v56/i03/.